{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## 2.4.1 点互信息"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:06:33.254977400Z",
     "start_time": "2023-05-04T11:06:33.148979700Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "def ppmi(C, verbose=False, eps=1e-8):\n",
    "    M = np.zeros_like(C, dtype=np.float32)\n",
    "    N = np.sum(C)\n",
    "    S = np.sum(C, axis=0)\n",
    "    total = C.shape[0] * C.shape[1]\n",
    "    cnt = 0\n",
    "\n",
    "    for i in range(C.shape[0]):\n",
    "        for j in range(C.shape[1]):\n",
    "            pmi = np.log2(C[i, j] * N / (S[j] * S[i] + eps))\n",
    "            M[i, j] = max(0, pmi)\n",
    "\n",
    "            if verbose:\n",
    "                cnt += 1\n",
    "                if cnt % (total // 100 + 1) == 0:\n",
    "                    print('%.1f%% done' % (100 * cnt / total))\n",
    "    return M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('..')\n",
    "import numpy as np\n",
    "from common.util import preprocess, create_co_matrix, cos_similarity, ppmi"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:07:19.955710600Z",
     "start_time": "2023-05-04T11:07:19.930713200Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "text = 'You say goodbye and I say hello.'\n",
    "corpus, word_to_id, id_to_word = preprocess(text)\n",
    "vocab_size = len(word_to_id)\n",
    "C = create_co_matrix(corpus, vocab_size)\n",
    "W = ppmi(C)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:07:29.147271400Z",
     "start_time": "2023-05-04T11:07:29.131271900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "covariance matrix\n",
      "[[0 1 0 0 0 0 0]\n",
      " [1 0 1 0 1 1 0]\n",
      " [0 1 0 1 0 0 0]\n",
      " [0 0 1 0 1 0 0]\n",
      " [0 1 0 1 0 0 0]\n",
      " [0 1 0 0 0 0 1]\n",
      " [0 0 0 0 0 1 0]]\n",
      "--------------------------------------------------\n",
      "PPMI\n",
      "[[0.    1.807 0.    0.    0.    0.    0.   ]\n",
      " [1.807 0.    0.807 0.    0.807 0.807 0.   ]\n",
      " [0.    0.807 0.    1.807 0.    0.    0.   ]\n",
      " [0.    0.    1.807 0.    1.807 0.    0.   ]\n",
      " [0.    0.807 0.    1.807 0.    0.    0.   ]\n",
      " [0.    0.807 0.    0.    0.    0.    2.807]\n",
      " [0.    0.    0.    0.    0.    2.807 0.   ]]\n"
     ]
    }
   ],
   "source": [
    "np.set_printoptions(precision=3)  # 有效位数为3位\n",
    "print('covariance matrix')\n",
    "print(C)\n",
    "print('-' * 50)\n",
    "print('PPMI')\n",
    "print(W)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:07:44.285248800Z",
     "start_time": "2023-05-04T11:07:44.257225Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 2.4.3 基于SVD的降维"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('..')\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from common.util import preprocess, create_co_matrix, ppmi"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:10:40.057137800Z",
     "start_time": "2023-05-04T11:10:39.465134200Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "text = 'You say goodbye and I say hello.'\n",
    "corpus, word_to_id, id_to_word = preprocess(text)\n",
    "vocab_size = len(id_to_word)\n",
    "C = create_co_matrix(corpus, vocab_size, window_size=1)\n",
    "W = ppmi(C)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:10:45.022404400Z",
     "start_time": "2023-05-04T11:10:45.011403Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "# SVD\n",
    "U, S, V = np.linalg.svd(W)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:10:52.601642800Z",
     "start_time": "2023-05-04T11:10:52.585643500Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 1 0 0 0 0 0]\n"
     ]
    }
   ],
   "source": [
    "print(C[0])  # 共现矩阵"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:10:59.795916800Z",
     "start_time": "2023-05-04T11:10:59.789917100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.    1.807 0.    0.    0.    0.    0.   ]\n"
     ]
    }
   ],
   "source": [
    "print(W[0])  # PPMI矩阵"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:11:05.474996500Z",
     "start_time": "2023-05-04T11:11:05.459996Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[-1.110e-16  3.409e-01 -1.205e-01 -4.163e-16 -1.110e-16 -9.323e-01\n",
      " -2.426e-17]\n"
     ]
    }
   ],
   "source": [
    "print(U[0])  # SVD"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:11:25.323556600Z",
     "start_time": "2023-05-04T11:11:25.307555400Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "data": {
      "text/plain": "<Figure size 640x480 with 1 Axes>",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkkAAAGdCAYAAAAGx+eQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAA9hAAAPYQGoP6dpAAA18ElEQVR4nO3de3RU1d3/8c9M7hdmAoQkEKPhfodAIjGighoNitVoqwjUYAS03m2sLamWoPQxPoqKj1KxlEu9PSBWLE/FIE2lFYwEErGgEQVFIuQKZBICJCRzfn/4YzTNCTK5zBDyfq111ip79t7ne/aKnc/ac+aMxTAMQwAAAGjC6u0CAAAAzkSEJAAAABOEJAAAABOEJAAAABOEJAAAABOEJAAAABOEJAAAABOEJAAAABO+3i7gdDidTh04cEDdunWTxWLxdjkAAOA0GIahmpoa9enTR1Zr59uX6RQh6cCBA4qJifF2GQAAoBWKi4t1zjnneLsMt3WKkNStWzdJ3y2yzWbzcjUAAOB0VFdXKyYmxvU+3tl0ipB08iM2m81GSAIAoJPprLfKdL4PCAEAADyAkAQAAGCCkAQAAGCCkAQAADRx4kQ98MADrR4/b948xcXFuf596623atq0aW0vzIsISQAAACYISQAAACYISQAAQNJ3v3Dx61//Wj169FBUVJTmzZvneq2qqkqzZs1Sr169ZLPZdNlll+mTTz457bnr6up03333KSIiQoGBgbrooou0devWDriK9kNIAgAAkqQ///nPCgkJ0ZYtW/Tkk0/qscce04YNGyRJN954o8rLy/Xuu++qoKBAY8eO1eWXX65Dhw6d1ty//vWv9Ze//EV//vOfVVhYqAEDBiglJeW0x3sDIQkAgC7K6TRUfOioPi+tVl2DU6NGjVJWVpYGDhyotLQ0JSQkKDc3V5s2bVJ+fr5Wr16thIQEDRw4UAsWLFBYWJjefPPNHz1PbW2tXnzxRT311FO66qqrNGzYMC1ZskRBQUFaunSpB660dTrFE7cBAED72l1eo/U7y7Sn4oiONzTq20NH1XfgEO0ur9GAiO9+RqR3794qLy/XJ598oiNHjqhnz55N5jh27Jj27Nnzo+f6+uuvdeLECY0fP97V5ufnp3HjxqmoqKh9L6wdEZIAAOhidpfXaPnmvTpUW6/e9kAF+wfJ18eiqjqnlm/eq/TxsRoQ0U0Wi0VOp1NHjhxR7969tXHjxmZzhYWFebx+TyEkAQDQhTidhtbvLNOh2noNjAh1/a6aj9WqsCA/Haqt13uflqlfeKhrzNixY1VaWipfX1/Fxsa6fc6+ffvK399fmzdv1nnnnSdJOnHihLZu3dqmZzN1NO5JAgCgC9lfdUx7Ko6otz2w+Q/PWqTe9kDtLj+i/VXHXM3JyclKSkpSamqq3nvvPe3du1cffvihHn74YW3btu1HzxkSEqI777xTDz30kHJycvTZZ59p9uzZOnr0qGbOnNnel9hu2EkCAKALqa1v0PGGRgX7B5m+HuTvo7Lq46qtb3C1WSwWrVu3Tg8//LDS09NVUVGhqKgoXXLJJYqMjDyt8z7xxBNyOp265ZZbVFNTo4SEBK1fv17du3dvl+vqCBbDMAx3By1atEhPPfWUSktLNXr0aD3//PMaN26cad+JEyfqn//8Z7P2q6++Wu+8885pna+6ulp2u10Oh0M2m83dcgEAwP9XfOiont3whcKC/dQt0K/Z6zXHT6jq6An98opBiukR3KZzdfb3b7c/blu1apUyMjKUlZWlwsJCjR49WikpKSovLzft/9Zbb6mkpMR17Ny5Uz4+PrrxxhvbXDwAAHBPdFiQ+vcKVYnjuP5zn8QwDJU4jmtARKiiw8x3mroSt0PSM888o9mzZys9PV3Dhg3T4sWLFRwcrGXLlpn2P/nUzpPHhg0bFBwcTEgCAMALrFaLUkZEqkeIv74sP6Ka4yfU4HSq5vgJfVl+RD1C/HXl8EhZrZYfn+ws51ZIqq+vV0FBgZKTk7+fwGpVcnKy8vLyTmuOpUuX6uabb1ZISEiLferq6lRdXd3kAAAA7WNARDelj4/ViD52VR09ob2Vtao6ekIjo+2ur//DzRu3Kysr1djY2OwmrcjISH3++ec/Oj4/P187d+780adrZmdn69FHH3WnNAAA4IYBEd3Ub2Ko9lcdU219g0L8fRUdFsQO0g949BEAS5cu1ciRI1u8yfukzMxMORwO11FcXOyhCgEA6DqsVotiegRrSJRNMT2CCUj/wa2dpPDwcPn4+KisrKxJe1lZmaKiok45tra2VitXrtRjjz32o+cJCAhQQECAO6UBAAC0K7d2kvz9/RUfH6/c3FxXm9PpVG5urpKSkk45dvXq1aqrq9PPf/7z1lUKAADgQW4/TDIjI0MzZsxQQkKCxo0bp4ULF6q2tlbp6emSpLS0NEVHRys7O7vJuKVLlyo1NbXZj+MBAACcidwOSVOmTFFFRYXmzp2r0tJSxcXFKScnx3Uz9759+2S1Nt2g2rVrlzZt2qT33nuvfaoGAADoYK164randfYndgIA0BV19vdvfuAWAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADARKtC0qJFixQbG6vAwEAlJiYqPz//lP2rqqp09913q3fv3goICNCgQYO0bt26VhUMAADgCb7uDli1apUyMjK0ePFiJSYmauHChUpJSdGuXbsUERHRrH99fb2uuOIKRURE6M0331R0dLS++eYbhYWFtUf9AAAAHcJiGIbhzoDExESdf/75euGFFyRJTqdTMTExuvfeezVnzpxm/RcvXqynnnpKn3/+ufz8/FpVZHV1tex2uxwOh2w2W6vmAAAAntXZ37/d+ritvr5eBQUFSk5O/n4Cq1XJycnKy8szHbN27VolJSXp7rvvVmRkpEaMGKHHH39cjY2NLZ6nrq5O1dXVTQ4AAABPciskVVZWqrGxUZGRkU3aIyMjVVpaajrmq6++0ptvvqnGxkatW7dOv/vd7/T000/r97//fYvnyc7Olt1udx0xMTHulAkAANBmHf7tNqfTqYiICP3xj39UfHy8pkyZoocffliLFy9ucUxmZqYcDofrKC4u7ugyAQAAmnDrxu3w8HD5+PiorKysSXtZWZmioqJMx/Tu3Vt+fn7y8fFxtQ0dOlSlpaWqr6+Xv79/szEBAQEKCAhwpzQAAIB25dZOkr+/v+Lj45Wbm+tqczqdys3NVVJSkumY8ePHa/fu3XI6na62L774Qr179zYNSAAAAGcCtz9uy8jI0JIlS/TnP/9ZRUVFuvPOO1VbW6v09HRJUlpamjIzM13977zzTh06dEj333+/vvjiC73zzjt6/PHHdffdd7ffVQAAALQzt5+TNGXKFFVUVGju3LkqLS1VXFyccnJyXDdz79u3T1br99krJiZG69ev1y9/+UuNGjVK0dHRuv/++/Wb3/ym/a4CAACgnbn9nCRv6OzPWQAAoCvq7O/f/HYbAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACACUISAACAiVaFpEWLFik2NlaBgYFKTExUfn5+i31XrFghi8XS5AgMDGx1wQAAAJ7gdkhatWqVMjIylJWVpcLCQo0ePVopKSkqLy9vcYzNZlNJSYnr+Oabb9pUNAAAQEdzOyQ988wzmj17ttLT0zVs2DAtXrxYwcHBWrZsWYtjLBaLoqKiXEdkZGSbigYAAOhoboWk+vp6FRQUKDk5+fsJrFYlJycrLy+vxXFHjhzReeedp5iYGF133XX69NNPW18xAACAB7gVkiorK9XY2NhsJygyMlKlpaWmYwYPHqxly5bpr3/9q1599VU5nU5deOGF+vbbb1s8T11dnaqrq5scAAAAntTh325LSkpSWlqa4uLiNGHCBL311lvq1auXXnrppRbHZGdny263u46YmJiOLhMAAKAJt0JSeHi4fHx8VFZW1qS9rKxMUVFRpzWHn5+fxowZo927d7fYJzMzUw6Hw3UUFxe7UyYAAECbuRWS/P39FR8fr9zcXFeb0+lUbm6ukpKSTmuOxsZG7dixQ717926xT0BAgGw2W5MDAADAk3zdHZCRkaEZM2YoISFB48aN08KFC1VbW6v09HRJUlpamqKjo5WdnS1Jeuyxx3TBBRdowIABqqqq0lNPPaVvvvlGs2bNat8rAQAAaEduh6QpU6aooqJCc+fOVWlpqeLi4pSTk+O6mXvfvn2yWr/foDp8+LBmz56t0tJSde/eXfHx8frwww81bNiw9rsKAACAdmYxDMPwdhE/prq6Wna7XQ6Hg4/eAADoJDr7+ze/3QYAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAAGCCkAQAADRx4kQ98MADXjl3bGysFi5c6Pq3xWLR22+/7ZVafsjX2wUAAADve+utt+Tn5+ftMs4ohCQAAKAePXp4u4QzDh+3AQAATZw4UXfddZemT5+ukJAQ9e7dW88++2yTj+EOHz6stLQ0de/eXcHBwbrqqqv05ZdfNpnnL3/5i4YPH66AgACNHDmy2XnKy8v1k5/8REFBQerbt69ee+0103pKSkp01VVXKSgoSP369dObb77peu2yyy7TPffc06R/RUWF/P39lZubK0mqq6vTr371K0VHRyskJESJiYnauHGjW2tCSAIAAJKkf/3rX9q8ebPWrl2rDRs26IMPPlBhYaHr9VtvvVXbtm3T2rVrlZeXJ8MwdPXVV+vEiROSpIKCAt100026+eabtWPHDs2ZM0eSmgShW2+9VcXFxXr//ff15ptv6g9/+IPKy8ub1fK73/1OP/3pT/XJJ59o+vTpuvnmm1VUVCRJmjVrll5//XXV1dW5+r/66quKjo7WZZddJkm65557lJeXp5UrV+rf//63brzxRk2aNKlZqDsloxNwOByGJMPhcHi7FAAAzkoXXXSRYbVajdWrV7vaqqqqjODgYOP+++83vvjiC0OSsXnzZtfrlZWVRlBQkPHGG28YhmEY06ZNM6644grX6yffv4cMGWIYhmHs2rXLkGTk5+e7+hQVFRmSjGeffdbVJsn4xS9+0aS+xMRE48477zQMwzCOHTtmdO/e3Vi1apXr9VGjRhnz5s0zDMMwvvnmG8PHx8fYv39/kzkuv/xyIzMz87TXhHuSAADoohoanCosPqyDtfWqdNTI6XRq3LhxrtftdrsGDx4sSSoqKpKvr68SExNdr/fs2VODBw927fAUFRXpuuuua3aePXv2qLGx0TVHfHy867UhQ4YoLCys2ZikpKRm/96+fbskKTAwULfccouWLVumm266SYWFhdq5c6fWrl0rSdqxY4caGxs1aNCgJnPU1dWpZ8+ep70+hCQAALqg3KIyrdi8V3sP1upEo1PFh45JkjZ9WaFp557r5ep+3KxZsxQXF6dvv/1Wy5cv12WXXabzzjtPknTkyBH5+PiooKBAPj4+TcaFhoae9jm4JwkAgC4mt6hM2e9+ri/Ka9Qt0FfR3YPk7x8gWSx64pV3lFtUJklyOBz64osvJElDhw5VQ0ODtmzZ4prn4MGD2rVrl4YNG+bqs3nz5mbnGzBggHx8fDRkyBA1NDSooKDA9dquXbtUVVXVbMxHH33U7N9Dhw51/XvkyJFKSEjQkiVL9Prrr+u2225zvTZmzBg1NjaqvLxcAwYMaHJERUWd9jqxkwQAQBfS0ODUis17VXP8hM7tHiSr9bv9El9fX9miYvXF2hf1373CFf7zizV//qOyWq2yWCwaOHCgrrvuOs2ePVsvvfSSunXrpjlz5ig6Otr1EduDDz6o888/X/Pnz9eUKVP0j3/8Q5J07733SpIGDx6sSZMm6Y477tCLL74oX19fPfDAAwoKCmpW5+rVq5WQkKCLLrpIr732mvLz87V06dImfWbNmqV77rlHISEhuv76613tgwYN0vTp05WWlqann35aY8aMUUVFhXJzczVq1ChNnjz5tNaKnSQAALqQwuLD2nuwVj1D/F0B6aReA+PUo98I5T73oC6/Ilnjx4/X0KFDFRgYKElavny54uPjdc011ygpKUmGYWjdunWuh1COHTtWb7zxhlauXKkRI0bo8ccflyRNnz7ddY7ly5erT58+mjBhgm644QbdfvvtioiIaFbno48+qpUrV2rUqFF6+eWX9b//+7+uHauTpk6dKl9fX02dOtVV4w/Pk5aWpgcffFCDBw9Wamqqtm7dqnPd+CjR8v/vIj+jVVdXy263y+FwyGazebscAAA6rXd3lujRtZ8qunuQfK3N90oanE7tP3xMWdcO1yV9bYqOjtbTTz+tmTNnun2ujn7/3rt3r/r376+tW7dq7Nix7T4/H7cBANCF9Azxl5+PVcfqG9UtsGlIOrxvl8qLv5Jv1CBVfG3V9Ef+R5JMv7HmTSdOnNDBgwf1yCOP6IILLuiQgCQRkgAA6FLGxnRXbM8QfVFeoxB/nyYfuTkNQ1/mrlT9wW/1wJ8CFR8frw8++EDh4eFerLi5zZs369JLL9WgQYOaPIm7vfFxGwAAXczJb7fVHD+hniH+CvL30bH6Rh2srZct0E9zrhqiy4dGtvk8nf39m50kAAC6mJMB6ORzkg7V1svPx6rBkd0048LYdglIZwNCEgAAXdDlQyM1YWAv1xO3e4b4a2xMd/n68sX3kwhJAAB0Ub6+Vo3re/o/09HVEBcBAABMEJIAAABMEJIAAABMEJIAAABMEJIAAABMEJIAAABMtCokLVq0SLGxsQoMDFRiYqLy8/NPa9zKlStlsViUmpramtMCAAB4jNshadWqVcrIyFBWVpYKCws1evRopaSkqLy8/JTj9u7dq1/96le6+OKLW10sAACAp7gdkp555hnNnj1b6enpGjZsmBYvXqzg4GAtW7asxTGNjY2aPn26Hn30UfXr169NBQMAAHiCWyGpvr5eBQUFSk5O/n4Cq1XJycnKy8trcdxjjz2miIgIzZw587TOU1dXp+rq6iYHAACAJ7kVkiorK9XY2KjIyKY/fBcZGanS0lLTMZs2bdLSpUu1ZMmS0z5Pdna27Ha764iJiXGnTAAAgDbr0G+31dTU6JZbbtGSJUsUHh5+2uMyMzPlcDhcR3FxcQdWCQAA0JxbP3AbHh4uHx8flZWVNWkvKytTVFRUs/579uzR3r179ZOf/MTV5nQ6vzuxr6927dql/v37NxsXEBCggIAAd0oDAABoV27tJPn7+ys+Pl65ubmuNqfTqdzcXCUlJTXrP2TIEO3YsUPbt293Hddee60uvfRSbd++nY/RAADAGcutnSRJysjI0IwZM5SQkKBx48Zp4cKFqq2tVXp6uiQpLS1N0dHRys7OVmBgoEaMGNFkfFhYmCQ1awcAADiTuB2SpkyZooqKCs2dO1elpaWKi4tTTk6O62buffv2yWrlQd4AAKBzsxiGYXi7iB9TXV0tu90uh8Mhm83m7XIAAMBp6Ozv32z5AAAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAmCAkAQAAvfzyy+rZs6fq6uqatKempuqWW26RJL344ovq37+//P39NXjwYL3yyiuufnv37pXFYtH27dtdbVVVVZKkDz74oMPr7wiEJAAAoBtvvFGNjY1au3atq628vFzvvPOObrvtNq1Zs0b333+/HnzwQe3cuVN33HGH0tPT9f7773ux6o5FSAIAAAoKCtK0adO0fPlyV9urr76qc889VxMnTtSCBQt066236q677tKgQYOUkZGhG264QQsWLPBi1R2LkAQAQBfldBoqPnRUn5dWq/jQUc2cOUvvvfee9u/fL0lasWKFbr31VlksFhUVFWn8+PFNxo8fP15FRUXeKN0jfL1dAAAA8Lzd5TVav7NMeyqO6HhDowJ9fdS/l01Dho/Uyy+/rCuvvFKffvqp3nnnndOaz2r9bt/FMAxXW0NDQ4fU7insJAEA0MXsLq/R8s17tfOAQ2HBfuoXHqqwYD/tPODQORdM1pKly7R8+XIlJycrJiZGkjR06FBt3ry5yTybN2/WsGHDJEm9evWSJJWUlLhe//e//+2hK+oY7CQBANCFOJ2G1u8s06Haeg2MCJXFYpEkdQv0U2iAr+rir9D7Lz+jJUuW6OWXX3aNe+ihh3TTTTdpzJgxSk5O1v/93//prbfe0t///ndJ393TdMEFF+iJJ55Q3759VV5ert///vdeucb2wk4SAABdyP6qY9pTcUS97YGugHSSxWJRbO9w9T//MgWHhCo1NdX1Wmpqqp577jktWLBAw4cP10svvaTly5dr4sSJrj7Lli1TQ0OD4uPj9cADD+iRRx7x0FV1DIvxww8Pz1DV1dWy2+1yOByy2WzeLgcAgE7r89Jq/U/ul+oXHiofq6XZ6w1Op57LSNOlSWP1yp8Wt+lcnf39m4/bAADoQkL8fRXo66Oj9Q3qFujX5LWjNQ7t3PahDhQVaOayP3qpwjMHIQkAgC4kOixI/XuFaucBh0IDfJt85Pb0XamqrXYo9faHdMn5o71Y5ZmBkAQAQBditVqUMiJSBxzH9GX5d/cmBfn76Fh9o6YsWKseIf5KHx8rq8lHcV0NN24DANDFDIjopvTxsRrRx66qoye0t7JWVUdPaGS0XenjYzUgopu3SzwjsJMEAEAXNCCim/pNDNX+qmOqrW9QiL+vosOC2EH6AUISAABdlNVqUUyPYG+Xccbi4zYAAAAThCQAAAAThCQAAAAThCQAAAAThCQAAAAThCQAAAAThCQAAAAThCQAAAATrQpJixYtUmxsrAIDA5WYmKj8/PwW+7711ltKSEhQWFiYQkJCFBcXp1deeaXVBQMAAHiC2yFp1apVysjIUFZWlgoLCzV69GilpKSovLzctH+PHj308MMPKy8vT//+97+Vnp6u9PR0rV+/vs3FAwAAdBSLYRiGOwMSExN1/vnn64UXXpAkOZ1OxcTE6N5779WcOXNOa46xY8dq8uTJmj9//mn1r66ult1ul8PhkM1mc6dcAADgJZ39/dutnaT6+noVFBQoOTn5+wmsViUnJysvL+9HxxuGodzcXO3atUuXXHJJi/3q6upUXV3d5AAAAPAkt0JSZWWlGhsbFRkZ2aQ9MjJSpaWlLY5zOBwKDQ2Vv7+/Jk+erOeff15XXHFFi/2zs7Nlt9tdR0xMjDtlAgAAtJlHvt3WrVs3bd++XVu3btV//dd/KSMjQxs3bmyxf2ZmphwOh+soLi72RJkAAAAuvu50Dg8Pl4+Pj8rKypq0l5WVKSoqqsVxVqtVAwYMkCTFxcWpqKhI2dnZmjhxomn/gIAABQQEuFMaAABAu3JrJ8nf31/x8fHKzc11tTmdTuXm5iopKem053E6naqrq3Pn1AAAAB7l1k6SJGVkZGjGjBlKSEjQuHHjtHDhQtXW1io9PV2SlJaWpujoaGVnZ0v67v6ihIQE9e/fX3V1dVq3bp1eeeUVvfjii+17JQAAAO3I7ZA0ZcoUVVRUaO7cuSotLVVcXJxycnJcN3Pv27dPVuv3G1S1tbW666679O233yooKEhDhgzRq6++qilTprTfVQAAALQzt5+T5A2d/TkLAAB0RZ39/ZvfbgMAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADBBSAIAADDRqpC0aNEixcbGKjAwUImJicrPz2+x75IlS3TxxRere/fu6t69u5KTk0/ZHwAA4EzgdkhatWqVMjIylJWVpcLCQo0ePVopKSkqLy837b9x40ZNnTpV77//vvLy8hQTE6Mrr7xS+/fvb3PxAAAAHcViGIbhzoDExESdf/75euGFFyRJTqdTMTExuvfeezVnzpwfHd/Y2Kju3bvrhRdeUFpa2mmds7q6Wna7XQ6HQzabzZ1yAQCAl3T292+3dpLq6+tVUFCg5OTk7yewWpWcnKy8vLzTmuPo0aM6ceKEevTo0WKfuro6VVdXNzkAAAA8ya2QVFlZqcbGRkVGRjZpj4yMVGlp6WnN8Zvf/EZ9+vRpErT+U3Z2tux2u+uIiYlxp0wAAIA28+i325544gmtXLlSa9asUWBgYIv9MjMz5XA4XEdxcbEHqwQAAJB83ekcHh4uHx8flZWVNWkvKytTVFTUKccuWLBATzzxhP7+979r1KhRp+wbEBCggIAAd0oDAABoV27tJPn7+ys+Pl65ubmuNqfTqdzcXCUlJbU47sknn9T8+fOVk5OjhISE1lcLAADgIW7tJElSRkaGZsyYoYSEBI0bN04LFy5UbW2t0tPTJUlpaWmKjo5Wdna2JOm///u/NXfuXL3++uuKjY113bsUGhqq0NDQdrwUAACA9uN2SJoyZYoqKio0d+5clZaWKi4uTjk5Oa6bufft2yer9fsNqhdffFH19fX62c9+1mSerKwszZs3r23VAwAAdBC3n5PkDZ39OQsAAHRFnf39m99uAwAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMEFIAgAAMNGqkLRo0SLFxsYqMDBQiYmJys/Pb7Hvp59+qp/+9KeKjY2VxWLRwoULW1srAACAx7gdklatWqWMjAxlZWWpsLBQo0ePVkpKisrLy037Hz16VP369dMTTzyhqKioNhcMAADgCW6HpGeeeUazZ89Wenq6hg0bpsWLFys4OFjLli0z7X/++efrqaee0s0336yAgIA2FwwAAOAJboWk+vp6FRQUKDk5+fsJrFYlJycrLy+v3Yqqq6tTdXV1kwMAAMCT3ApJlZWVamxsVGRkZJP2yMhIlZaWtltR2dnZstvtriMmJqbd5gYAADgdZ+S32zIzM+VwOFxHcXGxt0sCAABdjK87ncPDw+Xj46OysrIm7WVlZe16U3ZAQAD3LwEAAK9yayfJ399f8fHxys3NdbU5nU7l5uYqKSmp3YsDAADwFrd2kiQpIyNDM2bMUEJCgsaNG6eFCxeqtrZW6enpkqS0tDRFR0crOztb0nc3e3/22Weu/71//35t375doaGhGjBgQDteCgAAQPtxOyRNmTJFFRUVmjt3rkpLSxUXF6ecnBzXzdz79u2T1fr9BtWBAwc0ZswY178XLFigBQsWaMKECdq4cWPbrwAAAKADWAzDMLxdxI+prq6W3W6Xw+GQzWbzdjkAAOA0dPb37zPy220AAADeRkgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUgCAAAwQUjCWWfFihUKCwvzdhkAgE6OkAQAAGCCkAQAAGCCkNTB3nzzTY0cOVJBQUHq2bOnkpOTVVtbq61bt+qKK65QeHi47Ha7JkyYoMLCQte42267Tddcc02TuU6cOKGIiAgtXbrU05fRYXJycnTRRRcpLCxMPXv21DXXXKM9e/ZIkvbu3SuLxaK33npLl156qYKDgzV69Gjl5eU1mWPFihU699xzFRwcrOuvv14HDx70xqUAAM4yhKQOVFJSoqlTp+q2225TUVGRNm7cqBtuuEGGYaimpkYzZszQpk2b9NFHH2ngwIG6+uqrVVNTI0maNWuWcnJyVFJS4prvb3/7m44ePaopU6Z465LaXW1trTIyMrRt2zbl5ubKarXq+uuvl9PpdPV5+OGH9atf/Urbt2/XoEGDNHXqVDU0NEiStmzZopkzZ+qee+7R9u3bdemll+r3v/+9ty4HAHAWsRiGYXi7iB9TXV0tu90uh8Mhm83WLnM6nYb2Vx1TbX2DQvx9FR0WJKvV0q7zflW0Q5Mvu0h79+7Veeed9yPjnAoLC9Prr7/u2kEaPny4ZsyYoV//+teSpGuvvVY9e/bU8uXL21ynt/zYuldWVqpXr17asWOHQkND1bdvX/3pT3/SzJkzJUmfffaZhg8frqKiIg0ZMkTTpk2Tw+HQO++845rj5ptvVk5Ojqqqqjx9eQCAH+iI929PatVO0qJFixQbG6vAwEAlJiYqPz//lP1Xr16tIUOGKDAwUCNHjtS6detaVWx72V1eoxc37tGzG77Q/+R+qWc3fKEXN+7R7vKadp33vdJADR57oYaPGKkbb7xRS5Ys0eHDhyVJZWVlmj17tgYOHCi73S6bzaYjR45o3759rvlmzZrlCkRlZWV69913ddttt7WpRm8yW/dHX83VNdf/TP369ZPNZlNsbKwkNVmHUaNGuf537969JUnl5eWSpKKiIiUmJjY5T1JSUgdfCQCgK3A7JK1atUoZGRnKyspSYWGhRo8erZSUFNeb1n/68MMPNXXqVM2cOVMff/yxUlNTlZqaqp07d7a5+NbYXV6j5Zv3aucBh8KC/dQvPFRhwX7aecCh5Zv3tjoomc3bIzRQEx94TjdkvqCo8/rr+eef1+DBg/X1119rxowZ2r59u5577jl9+OGH2r59u3r27Kn6+nrXnGlpafrqq6+Ul5enV199VX379tXFF1/cXkvhUS2t+//MmaXP9pZo3pPPacuWLdqyZYskNVkHPz8/1/+2WL7bdfrhx3EAAHQEt0PSM888o9mzZys9PV3Dhg3T4sWLFRwcrGXLlpn2f+655zRp0iQ99NBDGjp0qObPn6+xY8fqhRdeaHPx7nI6Da3fWaZDtfUaGBGqboF+8rFa1C3QTwMjQnWotl7vfVomp9O9TyBPNe+gyG4KOXe4hk6epYKCQvn7+2vNmjXavHmz7rvvPl199dUaPny4AgICVFlZ2WTenj17KjU1VcuXL9eKFSuUnp7ensvhMS2tj7X+iKpKvtGoa9J1pOdQDR48xLXTdrqGDh3qClYnffTRR+1ZPgCgi/J1p3N9fb0KCgqUmZnparNarUpOTm72jaOT8vLylJGR0aQtJSVFb7/9dovnqaurU11dnevf1dXV7pTZov1Vx7Sn4oh62wNdOxInbVr7mrZ/8J6CfrtY+6uOKaZHcJvn/aboE325PU/njEhUocOmP+39SBUVFRo6dKgGDhyoV155RQkJCaqurtZDDz2koKCgZnPPmjVL11xzjRobGzVjxozWX7wXtbQ+QaF2hdjC9PWmv2pbeITecHyhZx6f59bc9913n8aPH68FCxbouuuu0/r165WTk9POVwAA6Irc2kmqrKxUY2OjIiMjm7RHRkaqtLTUdExpaalb/SUpOztbdrvddcTExLhTZotq6xt0vKFRwf7Ns2Gt47CqSotV19Co2vqGdpk3MCRUe3Zs1WuP3aUVv0zVk//1qJ5++mldddVVWrp0qQ4fPqyxY8fqlltu0X333aeIiIhmcycnJ6t3795KSUlRnz593LvgM0RL62O1WnXLb59V6VdFevmhGzXv4d/oqaeecmvuCy64QEuWLNFzzz2n0aNH67333tMjjzzSnuUDALoot3aSPCUzM7PJ7lN1dXW7BKUQf18F+vroaH2DugX6NXltUtq9Gn/TL1R19IRCTEJUa+aNPLe/7nh8qWqOn1DV0RP65RWDXDtUY8aM0datW5vM87Of/azZ3LW1tTp8+LDr212d0anWfdDYC3X3C39tsj4//MLlf375MiwsrFnbbbfd1uyG9gcffLCdrwIA0NW4tZMUHh4uHx8flZWVNWkvKytTVFSU6ZioqCi3+ktSQECAbDZbk6M9RIcFqX+vUJU4jjd7ozUMQyWO4xoQEarosOYfe3l6XqfTqfLycs2fP19hYWG69tpr3arpTNJR6w4AQEdyKyT5+/srPj5eubm5rjan06nc3NwWv3adlJTUpL8kbdiwwStf07ZaLUoZEakeIf76svyIao6fUIPTqZrjJ/Rl+RH1CPHXlcMj3X5eUkfMu2/fPkVGRur111/XsmXL5Ot7Rm76nZaOWncAADqS2w+TXLVqlWbMmKGXXnpJ48aN08KFC/XGG2/o888/V2RkpNLS0hQdHa3s7GxJ3z0CYMKECXriiSc0efJkrVy5Uo8//rgKCws1YsSI0zpnez+Mand5jdbvLNOeiiOqa2hUgK+PBkSE6srhkRoQ0e2Mm/dswfoAQNfS2R8m6fb2xJQpU1RRUaG5c+eqtLRUcXFxysnJcd2cvW/fPlmt329QXXjhhXr99df1yCOP6Le//a0GDhyot99++7QDUkcYENFN/SaGtvsTtztq3rMF6wMA6Ey67M+SAACAjtXZ37/5gVsAAAAThCQAAAAThCQAAAAThCQAAAAThCQAAAAThCQAAAAThCQAAAAThCQAAAAThCQAAAATneJXU08+FLy6utrLlQAAgNN18n27E/y4h6lOEZJqamokSTExMV6uBAAAuKumpkZ2u93bZbitU/x2m9Pp1IEDB9StWzdZLGf2j6FWV1crJiZGxcXFnfJ3ajoSa3NqrM+psT6nxvq0jLU5tY5cH8MwVFNToz59+shq7Xx3+HSKnSSr1apzzjnH22W4xWaz8R9jC1ibU2N9To31OTXWp2Wszal11Pp0xh2kkzpfrAMAAPAAQhIAAIAJQlI7CwgIUFZWlgICArxdyhmHtTk11ufUWJ9TY31axtqcGuvTsk5x4zYAAICnsZMEAABggpAEAABggpAEAABggpAEAABggpDURocOHdL06dNls9kUFhammTNn6siRIz86Li8vT5dddplCQkJks9l0ySWX6NixYx6o2LNasz4TJ06UxWJpcvziF7/wUMWe1dq/H+m7J9leddVVslgsevvttzu2UC9pzfrccccd6t+/v4KCgtSrVy9dd911+vzzzz1Usee4uzaHDh3Svffeq8GDBysoKEjnnnuu7rvvPjkcDg9W7Tmt+dv54x//qIkTJ8pms8lisaiqqsozxXrAokWLFBsbq8DAQCUmJio/P/+U/VevXq0hQ4YoMDBQI0eO1Lp16zxU6ZmFkNRG06dP16effqoNGzbob3/7m/71r3/p9ttvP+WYvLw8TZo0SVdeeaXy8/O1detW3XPPPZ3yke0/pjXrI0mzZ89WSUmJ63jyySc9UK3ntXZ9JGnhwoVn/M/0tFVr1ic+Pl7Lly9XUVGR1q9fL8MwdOWVV6qxsdFDVXuGu2tz4MABHThwQAsWLNDOnTu1YsUK5eTkaObMmR6s2nNa87dz9OhRTZo0Sb/97W89VKVnrFq1ShkZGcrKylJhYaFGjx6tlJQUlZeXm/b/8MMPNXXqVM2cOVMff/yxUlNTlZqaqp07d3q48jOAgVb77LPPDEnG1q1bXW3vvvuuYbFYjP3797c4LjEx0XjkkUc8UaJXtXZ9JkyYYNx///0eqNC7Wrs+hmEYH3/8sREdHW2UlJQYkow1a9Z0cLWe15b1+aFPPvnEkGTs3r27I8r0ivZamzfeeMPw9/c3Tpw40RFlek1b1+f99983JBmHDx/uwCo9Z9y4ccbdd9/t+ndjY6PRp08fIzs727T/TTfdZEyePLlJW2JionHHHXd0aJ1norNv68KD8vLyFBYWpoSEBFdbcnKyrFartmzZYjqmvLxcW7ZsUUREhC688EJFRkZqwoQJ2rRpk6fK9pjWrM9Jr732msLDwzVixAhlZmbq6NGjHV2ux7V2fY4ePapp06Zp0aJFioqK8kSpXtGWv5+TamtrtXz5cvXt21cxMTEdVarHtcfaSJLD4ZDNZpOvb6f4Gc/T1l7rczaor69XQUGBkpOTXW1Wq1XJycnKy8szHZOXl9ekvySlpKS02P9sRkhqg9LSUkVERDRp8/X1VY8ePVRaWmo65quvvpIkzZs3T7Nnz1ZOTo7Gjh2ryy+/XF9++WWH1+xJrVkfSZo2bZpeffVVvf/++8rMzNQrr7yin//85x1drse1dn1++ctf6sILL9R1113X0SV6VWvXR5L+8Ic/KDQ0VKGhoXr33Xe1YcMG+fv7d2S5HtWWtTmpsrJS8+fPP+2PdzuT9lifs0VlZaUaGxsVGRnZpD0yMrLFtSgtLXWr/9mMkGRizpw5zW4c/s+jtTeCOp1OSd/dXJqenq4xY8bo2Wef1eDBg7Vs2bL2vIwO05HrI0m33367UlJSNHLkSE2fPl0vv/yy1qxZoz179rTjVXScjlyftWvX6h//+IcWLlzYvkV7UEf//Ujf3Y/y8ccf65///KcGDRqkm266ScePH2+nK+g4nlgbSaqurtbkyZM1bNgwzZs3r+2Fe4in1gc46ezaY20nDz74oG699dZT9unXr5+ioqKa3fjW0NCgQ4cOtfgxSO/evSVJw4YNa9I+dOhQ7du3r/VFe1BHro+ZxMRESdLu3bvVv39/t+v1tI5cn3/84x/as2ePwsLCmrT/9Kc/1cUXX6yNGze2oXLP8MTfj91ul91u18CBA3XBBReoe/fuWrNmjaZOndrW8juUJ9ampqZGkyZNUrdu3bRmzRr5+fm1tWyP8fT/95wNwsPD5ePjo7KysibtZWVlLa5FVFSUW/3Pat6+KaozO3lz4LZt21xt69evP+XNgU6n0+jTp0+zG7fj4uKMzMzMDq3X01qzPmY2bdpkSDI++eSTjijTa1qzPiUlJcaOHTuaHJKM5557zvjqq688VbpHtNffz/Hjx42goCBj+fLlHVCld7R2bRwOh3HBBRcYEyZMMGpraz1Rqle09W/nbLxx+5577nH9u7Gx0YiOjj7ljdvXXHNNk7akpKQueeM2IamNJk2aZIwZM8bYsmWLsWnTJmPgwIHG1KlTXa9/++23xuDBg40tW7a42p599lnDZrMZq1evNr788kvjkUceMQIDA8+qb9+c5O767N6923jssceMbdu2GV9//bXx17/+1ejXr59xySWXeOsSOlRr/n7+k87Sb7cZhvvrs2fPHuPxxx83tm3bZnzzzTfG5s2bjZ/85CdGjx49jLKyMm9dRodwd20cDoeRmJhojBw50ti9e7dRUlLiOhoaGrx1GR2mNf9tlZSUGB9//LGxZMkSQ5Lxr3/9y/j444+NgwcPeuMS2s3KlSuNgIAAY8WKFcZnn31m3H777UZYWJhRWlpqGIZh3HLLLcacOXNc/Tdv3mz4+voaCxYsMIqKioysrCzDz8/P2LFjh7cuwWsISW108OBBY+rUqUZoaKhhs9mM9PR0o6amxvX6119/bUgy3n///SbjsrOzjXPOOccIDg42kpKSjA8++MDDlXuGu+uzb98+45JLLjF69OhhBAQEGAMGDDAeeughw+FweOkKOlZr/35+6GwOSe6uz/79+42rrrrKiIiIMPz8/IxzzjnHmDZtmvH555976Qo6jrtrc3J3xOz4+uuvvXMRHag1/21lZWWZrs/ZsAv5/PPPG+eee67h7+9vjBs3zvjoo49cr02YMMGYMWNGk/5vvPGGMWjQIMPf398YPny48c4773i44jODxTAMwyOf6wEAAHQifLsNAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADABCEJAADAxP8D51nfR9m8K+QAAAAASUVORK5CYII="
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for word, word_id in word_to_id.items():\n",
    "    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))\n",
    "plt.scatter(U[:, 0], U[:, 1], alpha=0.5)\n",
    "plt.show()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:12:36.638994100Z",
     "start_time": "2023-05-04T11:12:36.516287Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 2.4.4 PTB数据集"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('..')\n",
    "from dataset import ptb"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:14:31.144364400Z",
     "start_time": "2023-05-04T11:14:31.132369100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [],
   "source": [
    "corpus, word_to_id, id_to_word = ptb.load_data('train')"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:30:34.076324500Z",
     "start_time": "2023-05-04T11:30:33.721273300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "corpus size: 929589\n",
      "corpus[:30]: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23\n",
      " 24 25 26 27 28 29]\n"
     ]
    }
   ],
   "source": [
    "print('corpus size:', len(corpus))\n",
    "print('corpus[:30]:', corpus[:30])"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:31:00.303012100Z",
     "start_time": "2023-05-04T11:31:00.290010700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id_to_word[0]: aer\n",
      "id_to_word[1]: banknote\n",
      "id_to_word[2]: berlitz\n"
     ]
    }
   ],
   "source": [
    "print('id_to_word[0]:', id_to_word[0])\n",
    "print('id_to_word[1]:', id_to_word[1])\n",
    "print('id_to_word[2]:', id_to_word[2])"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:31:14.835405300Z",
     "start_time": "2023-05-04T11:31:14.790405900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "word_to_id['car']: 3856\n",
      "word_to_id['happy']: 4428\n",
      "word_to_id['lexus']: 7426\n"
     ]
    }
   ],
   "source": [
    "print(\"word_to_id['car']:\", word_to_id['car'])\n",
    "print(\"word_to_id['happy']:\", word_to_id['happy'])\n",
    "print(\"word_to_id['lexus']:\", word_to_id['lexus'])"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:31:28.569718600Z",
     "start_time": "2023-05-04T11:31:28.557717800Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [],
   "source": [
    "window_size = 2\n",
    "wordvec_size = 100"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:32:11.197068Z",
     "start_time": "2023-05-04T11:32:11.179055600Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "counting co-occurrence ...\n",
      "calculating PPMI ...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\python2\\code7\\NLP\\01_深度学习进阶：自然语言处理\\2_自然语言和单词的分布式表示\\..\\common\\util.py:139: RuntimeWarning: overflow encountered in long_scalars\n",
      "  pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)\n",
      "D:\\python2\\code7\\NLP\\01_深度学习进阶：自然语言处理\\2_自然语言和单词的分布式表示\\..\\common\\util.py:139: RuntimeWarning: invalid value encountered in log2\n",
      "  pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.0% done\n",
      "2.0% done\n",
      "3.0% done\n",
      "4.0% done\n",
      "5.0% done\n",
      "6.0% done\n",
      "7.0% done\n",
      "8.0% done\n",
      "9.0% done\n",
      "10.0% done\n",
      "11.0% done\n",
      "12.0% done\n",
      "13.0% done\n",
      "14.0% done\n",
      "15.0% done\n",
      "16.0% done\n",
      "17.0% done\n",
      "18.0% done\n",
      "19.0% done\n",
      "20.0% done\n",
      "21.0% done\n",
      "22.0% done\n",
      "23.0% done\n",
      "24.0% done\n",
      "25.0% done\n",
      "26.0% done\n",
      "27.0% done\n",
      "28.0% done\n",
      "29.0% done\n",
      "30.0% done\n",
      "31.0% done\n",
      "32.0% done\n",
      "33.0% done\n",
      "34.0% done\n",
      "35.0% done\n",
      "36.0% done\n",
      "37.0% done\n",
      "38.0% done\n",
      "39.0% done\n",
      "40.0% done\n",
      "41.0% done\n",
      "42.0% done\n",
      "43.0% done\n",
      "44.0% done\n",
      "45.0% done\n",
      "46.0% done\n",
      "47.0% done\n",
      "48.0% done\n",
      "49.0% done\n",
      "50.0% done\n",
      "51.0% done\n",
      "52.0% done\n",
      "53.0% done\n",
      "54.0% done\n",
      "55.0% done\n",
      "56.0% done\n",
      "57.0% done\n",
      "58.0% done\n",
      "59.0% done\n",
      "60.0% done\n",
      "61.0% done\n",
      "62.0% done\n",
      "63.0% done\n",
      "64.0% done\n",
      "65.0% done\n",
      "66.0% done\n",
      "67.0% done\n",
      "68.0% done\n",
      "69.0% done\n",
      "70.0% done\n",
      "71.0% done\n",
      "72.0% done\n",
      "73.0% done\n",
      "74.0% done\n",
      "75.0% done\n",
      "76.0% done\n",
      "77.0% done\n",
      "78.0% done\n",
      "79.0% done\n",
      "80.0% done\n",
      "81.0% done\n",
      "82.0% done\n",
      "83.0% done\n",
      "84.0% done\n",
      "85.0% done\n",
      "86.0% done\n",
      "87.0% done\n",
      "88.0% done\n",
      "89.0% done\n",
      "90.0% done\n",
      "91.0% done\n",
      "92.0% done\n",
      "93.0% done\n",
      "94.0% done\n",
      "95.0% done\n",
      "96.0% done\n",
      "97.0% done\n",
      "98.0% done\n",
      "99.0% done\n"
     ]
    }
   ],
   "source": [
    "corpus, word_to_id, id_to_word = ptb.load_data('train')\n",
    "vocab_size = len(word_to_id)\n",
    "print('counting co-occurrence ...')\n",
    "C = create_co_matrix(corpus, vocab_size, window_size)\n",
    "print('calculating PPMI ...')\n",
    "W = ppmi(C, verbose=True)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:35:40.452527900Z",
     "start_time": "2023-05-04T11:32:29.655157Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "calculating SVD ...\n"
     ]
    }
   ],
   "source": [
    "print('calculating SVD ...')\n",
    "try:\n",
    "    # truncated SVD (fast!)\n",
    "    from sklearn.utils.extmath import randomized_svd\n",
    "\n",
    "    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,\n",
    "                             random_state=None)\n",
    "except ImportError:\n",
    "    # SVD (slow)\n",
    "    U, S, V = np.linalg.svd(W)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:36:22.858819800Z",
     "start_time": "2023-05-04T11:36:20.677106300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [],
   "source": [
    "word_vecs = U[:, :wordvec_size]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:36:25.439384200Z",
     "start_time": "2023-05-04T11:36:25.424383600Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "outputs": [
    {
     "data": {
      "text/plain": "array([[-2.902e-11, -2.316e-07, -1.115e-06, ...,  1.803e-04,  1.620e-04,\n         3.341e-05],\n       [-1.455e-12, -8.809e-09,  1.563e-08, ...,  2.885e-04,  2.530e-04,\n         5.083e-05],\n       [-8.019e-13, -1.286e-09, -2.420e-08, ...,  4.033e-04,  3.527e-04,\n         6.528e-05],\n       ...,\n       [ 4.426e-03, -6.480e-03,  1.145e-02, ...,  1.161e-02,  2.663e-03,\n         1.013e-03],\n       [ 2.459e-03, -4.921e-03,  8.025e-03, ..., -5.063e-04,  8.922e-03,\n        -2.467e-05],\n       [ 3.888e-03, -7.842e-03,  8.080e-03, ..., -2.764e-03, -7.744e-03,\n        -1.042e-02]], dtype=float32)"
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_vecs"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:53:19.412006200Z",
     "start_time": "2023-05-04T11:53:19.399007600Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [],
   "source": [
    "from common.util import most_similar"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:38:00.960347100Z",
     "start_time": "2023-05-04T11:38:00.942347Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "[query] you\n",
      " i: 0.6768127679824829\n",
      " we: 0.6136245131492615\n",
      " do: 0.5688321590423584\n",
      " anybody: 0.5655081868171692\n",
      " 'll: 0.5179653763771057\n",
      "\n",
      "[query] year\n",
      " month: 0.6682701110839844\n",
      " quarter: 0.6501842737197876\n",
      " last: 0.6121847629547119\n",
      " third: 0.5968555808067322\n",
      " earlier: 0.5873135328292847\n",
      "\n",
      "[query] car\n",
      " luxury: 0.6470903754234314\n",
      " auto: 0.6370578408241272\n",
      " corsica: 0.5464427471160889\n",
      " midsized: 0.5348365902900696\n",
      " cars: 0.5322224497795105\n",
      "\n",
      "[query] toyota\n",
      " motor: 0.7418043613433838\n",
      " motors: 0.6874258518218994\n",
      " mazda: 0.5841508507728577\n",
      " nissan: 0.5772135853767395\n",
      " lexus: 0.5734121799468994\n"
     ]
    }
   ],
   "source": [
    "querys = ['you', 'year', 'car', 'toyota']\n",
    "for query in querys:\n",
    "    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-04T11:38:03.256474500Z",
     "start_time": "2023-05-04T11:38:02.535477300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
